Comparing My Output to Das Java Output

NIPS Dataset with 50 Topics


In [1]:
import numpy as np
import os
from operator import itemgetter
from collections import Counter
import scipy.stats as stat
from gensim.models import Word2Vec
from nltk import corpus

Using 50-dim word vectors trained on wikipedia


In [2]:
wvmodel = Word2Vec.load_word2vec_format(
    "/Users/michael/Documents/Gaussian_LDA-master/data/glove.wiki/glove.6B.50d.txt", binary=False)
print "word-vector dimension: {}".format(wvmodel.vector_size)


WARNING:gensim.models.word2vec:consider setting layer size to a multiple of 4 for greater performance
word-vector dimension: 50

Routine cleaning

stopwords and words in language model


In [3]:
wv_vocab = set(wvmodel.vocab.keys())
stops = set(corpus.stopwords.words(fileids="english"))

In [4]:
corpus = []
nips_path = "/Users/michael/Documents/GaussianLDA/data/"
for folder in os.listdir(nips_path)[1:]:
    for doc in os.listdir(nips_path + folder):
        with open(nips_path + folder + "/" + doc, 'r') as f:
            txt = f.read().split()
            txt = map(lambda x: x.lower(), txt)  # Lowercasing each word
            txt = filter(lambda word: [letter for letter in word if ord(letter) < 128 ], txt)  # Checking each word for ascci error
            txt = filter(lambda x: x not in stops, txt)  # Removing stop words
            txt = filter(lambda x: x.isalpha(), txt)  # Removing non-letter words (eg numbers and symbols)
            txt = filter(lambda x: len(x) > 2, txt)  # removing super short words and single letters
            txt = filter(lambda x: x in wv_vocab, txt)    
            txt = ' '.join(txt)
            corpus.append(txt)
            f.close()

In [5]:
words = []
for doc in corpus:
    for word in doc.split():
        words.append(word)
c = Counter(words)
vocab = set(words)

Removing very common words


In [6]:
temp_corp = []
common_words = zip(*c.most_common(n=50))[0]
for doc in corpus:
    doc = doc.split()
    doc = filter(lambda x: x not in common_words, doc)
    doc = " ".join(doc)
    temp_corp.append(doc)
corpus = temp_corp

Less words now!


In [7]:
print len(list(vocab))
print len(words)
vocab = list(vocab)


9007
116819

Shaping data for the java model

It requires strange files where words are indexed, and we provide the word-vectors corropsonding to the word indicies.


In [8]:
wordids = {word: i for i, word in enumerate(vocab)}

In [9]:
vecs = {i: wvmodel[word] for word, i in wordids.iteritems()}

In [10]:
index_corpus = []
for doc in corpus:
    docids = []
    for word in doc.split():
        docids.append(wordids[word])
    index_corpus.append(docids)

Sanity checking output - list (corpus) of lists(docs) of ints(words)


In [14]:
index_corpus[:2]


Out[14]:
[[7942,
  2469,
  2542,
  4611,
  3594,
  1744,
  1453,
  1572,
  3379,
  3482,
  6915,
  4678,
  3163,
  6690,
  4794,
  6852,
  7264,
  1221,
  7412,
  6399,
  1445,
  7616,
  4899,
  2016,
  4787,
  931,
  3379,
  7986,
  3335,
  1427,
  4553,
  8979,
  8199,
  8341,
  3347,
  8887,
  1856,
  1367,
  5649,
  3379,
  6503,
  697,
  2207,
  5727,
  4970,
  3764,
  4441,
  1676,
  6061,
  2578,
  2799,
  5832,
  1857,
  428,
  2542,
  5226,
  8985,
  2804,
  2937,
  7421,
  1510,
  3268,
  3322,
  3897,
  8957,
  4812,
  8289,
  6908,
  1427,
  7346,
  6912,
  4292,
  2799,
  5478,
  3773,
  6262,
  7780,
  2804,
  2937,
  8690,
  1895,
  5008,
  7780,
  8655,
  3166,
  6026,
  2578,
  486,
  3349,
  2469,
  7101,
  3166,
  962,
  7974,
  5383,
  2307,
  4576,
  2189,
  2469,
  1329,
  5952,
  1521,
  2048,
  2982,
  3296,
  2318,
  7745,
  6027,
  1257,
  4010,
  3600,
  5374,
  7877,
  2542,
  2469,
  4441,
  605,
  8483,
  2426,
  3335,
  365,
  7942,
  5727,
  8341,
  2207,
  2996,
  6026,
  8757,
  5678,
  4393,
  2469,
  5959,
  6690,
  6915,
  2882,
  6399,
  1445,
  7616,
  2152,
  4259,
  1433,
  931,
  1701,
  3166,
  3816,
  2016,
  4554,
  8840,
  7113,
  2822,
  5870,
  4999,
  7220,
  5997,
  3435,
  1147,
  598,
  1453,
  1572,
  6439,
  8617,
  4117,
  8617,
  3587,
  4464,
  742,
  3611,
  5152,
  3696,
  1643,
  7927,
  7264,
  5870,
  250,
  4999,
  8745,
  7101,
  5914,
  7942,
  5727,
  7156,
  4463,
  2985,
  4277,
  4259,
  5047,
  7859,
  4526,
  2822,
  1491,
  825,
  7551,
  4259,
  6432,
  5727,
  4429,
  5912,
  7156,
  4815,
  5969,
  1445,
  2499,
  1892,
  571,
  5764,
  4277,
  4183,
  5727,
  3076,
  4724,
  652,
  2469,
  2542,
  3969,
  8071,
  5305,
  2815,
  2822,
  4062,
  365,
  7156,
  2770,
  3969,
  7220,
  5800,
  4667,
  8412,
  4277,
  6861,
  2346,
  8199,
  1581,
  8975,
  8094,
  5398,
  7274,
  8745,
  7942,
  5478,
  4724,
  652,
  4688,
  5192,
  4463,
  7889,
  4556,
  8745,
  3347,
  5383,
  7929,
  4393,
  2113,
  3335,
  4441,
  605,
  1427,
  8909,
  6399,
  3347,
  1856,
  2578,
  2634,
  8453,
  2207,
  2967,
  2307,
  1420,
  1660,
  7812,
  2686,
  5832,
  7365,
  1149,
  3172,
  3727,
  2694,
  1754,
  1996,
  6026,
  2770,
  4171,
  2469,
  806,
  1381,
  3738,
  122,
  3926,
  3738,
  2469,
  2214,
  5626,
  4713,
  5692,
  8745,
  7942,
  5727,
  3335,
  2660,
  1427,
  5715,
  4468,
  3335,
  7156,
  4794,
  104,
  4576,
  2469,
  3894,
  4235,
  8115,
  8529,
  5785,
  4842,
  929,
  2955,
  2360,
  5251,
  132,
  5800,
  7838,
  2770,
  3335,
  7156,
  2754,
  6140,
  7942,
  5727,
  3727,
  6861,
  3656,
  4235,
  778,
  2985,
  2282,
  5540,
  5065,
  778,
  5348,
  4393,
  724,
  566,
  2244,
  5727,
  5510,
  2469,
  4794,
  104,
  8483,
  1427,
  3046,
  4921,
  4089,
  8953,
  4587,
  4543,
  8660,
  7518,
  7983,
  5715,
  7738,
  4794,
  104,
  8191,
  4999,
  1857,
  4130,
  2798,
  8653,
  2799,
  4553,
  8587,
  7448,
  3166,
  6917,
  2244,
  1665,
  6623,
  4553,
  3278,
  6623,
  823,
  2244,
  6345,
  4615,
  1816,
  8729,
  3025,
  392,
  2213,
  4553,
  3729,
  4387,
  1701,
  459,
  3347,
  2882,
  2244,
  250,
  4030,
  1197,
  7459,
  2523,
  6961,
  8525,
  1228,
  2493,
  2523,
  8008,
  3881,
  605,
  4130,
  2798,
  4259,
  4794,
  1157,
  6831,
  2804,
  7738,
  3409,
  5847,
  3087,
  8745,
  8729,
  1720,
  3738,
  1435,
  4901,
  3025,
  2213,
  3250,
  2781,
  2469,
  3547,
  7630,
  2869,
  2213,
  8811,
  2213,
  8414,
  3004,
  8989,
  3046,
  4492,
  7809,
  2798,
  7688,
  8154,
  8953,
  7708,
  711,
  2160,
  8710,
  8953,
  1816,
  8729,
  4030,
  2469,
  1427,
  1194,
  3046,
  843,
  5434,
  2559,
  1441,
  2106,
  4787,
  486,
  8729,
  5788,
  5156,
  7117,
  1149,
  7701,
  4577,
  7779,
  7805,
  664,
  8729,
  4339,
  2213,
  174,
  7117,
  1517,
  3025,
  2869,
  7350,
  8729,
  5293,
  7350,
  7942,
  8341,
  3347,
  2050,
  5489,
  4105,
  6877,
  2540,
  5547,
  3037,
  2404,
  2788,
  1097,
  6356,
  5489,
  4000,
  2050,
  2970,
  5489,
  4577,
  1149,
  8551,
  6273,
  8341,
  1430,
  3046,
  7172,
  4685,
  65,
  2000,
  65,
  5489,
  8530,
  3347,
  8156,
  2000,
  1430,
  6116,
  65,
  2000,
  65,
  4262,
  6010,
  823,
  6896,
  1279,
  3409,
  65,
  1793,
  8590,
  1430,
  7829,
  2957,
  5489,
  6604,
  856,
  8436,
  8919,
  8506,
  823,
  2559,
  7825,
  5785,
  441,
  3037,
  661,
  5547,
  3881,
  6991,
  2113,
  5913,
  4149,
  2844,
  4367,
  8590,
  3881,
  3881,
  3881,
  1430,
  4327,
  8341,
  2559,
  1430,
  8801,
  3046,
  325,
  7172,
  8341,
  2226,
  4056,
  5738,
  7577,
  843,
  4367,
  8590,
  7942,
  7156,
  3497,
  823,
  8374,
  8454,
  8341,
  7779,
  4367,
  8590,
  8897,
  7156,
  625,
  8341,
  7942,
  4587,
  1856,
  7942,
  4587,
  8801,
  8341,
  6915,
  5434,
  3524,
  7156,
  8587,
  490,
  2786,
  3476,
  3347,
  8156,
  8094,
  3349,
  5047,
  2828,
  3166,
  3161,
  3040,
  2499,
  7806,
  7132,
  4576,
  592,
  402,
  3060,
  592,
  6119,
  6152,
  8801,
  3894,
  5403,
  4393,
  6503,
  5383,
  6881,
  5374,
  123,
  5489,
  7942,
  4576,
  2469,
  3166,
  5403,
  2770,
  59,
  3037,
  5738,
  7180,
  7156,
  8532,
  2937,
  7812,
  3335,
  7908,
  6560,
  8199,
  7156,
  2493,
  2523,
  2207,
  3524,
  7172,
  6428,
  2207,
  6399,
  5870,
  4999,
  7927,
  6623,
  1433,
  1659,
  162,
  4393,
  7447,
  2634,
  1157,
  2207,
  5293,
  3930,
  3335,
  3409,
  950,
  3570,
  5565,
  7927,
  8080,
  5853,
  65,
  8525,
  2385,
  5780,
  7459,
  2493,
  102,
  3335,
  4387,
  6197,
  7931,
  4724,
  8525,
  2207,
  3764,
  7459,
  5428,
  5532,
  4272,
  4667,
  4429,
  5780,
  364,
  7375,
  8941,
  4563,
  5727,
  4563,
  3037,
  7779,
  2543,
  2881,
  6123,
  954,
  4049,
  364,
  7948,
  2207,
  7387,
  3613,
  2804,
  7617,
  8520,
  6027,
  2267,
  2678,
  6029,
  4904,
  8520,
  6581,
  3555,
  2881,
  5853,
  5969,
  4656,
  7791,
  2476,
  5188,
  2000,
  6877,
  5004,
  5727,
  7942,
  3335,
  8199,
  5649,
  7472,
  8154,
  6704,
  2207,
  2594,
  6262,
  8053,
  6203,
  1433,
  1228,
  2246,
  2143,
  1149,
  8188,
  455,
  4563,
  4285,
  7472,
  2160,
  8,
  308,
  4131,
  1653,
  6029,
  5041,
  4553,
  1147,
  4186,
  1994,
  8154,
  4313,
  3060,
  7472,
  2160,
  2160,
  6181,
  7983,
  3347,
  4648,
  8431,
  2207,
  8988,
  5293,
  8264,
  3821,
  706,
  6138,
  8727,
  2156,
  8191,
  1399,
  4292,
  4186,
  3507,
  7509,
  1214,
  6027,
  8520,
  3347,
  8473,
  5104,
  5527,
  2507,
  1228,
  4235,
  2822,
  2207,
  3166,
  8196,
  7636,
  1056,
  4030,
  29,
  517,
  5034,
  7156,
  6397,
  2207,
  6390,
  1521,
  843,
  4623,
  7156,
  8290,
  424,
  2346,
  424,
  5254,
  4941,
  3330,
  8068,
  7505,
  8414,
  7505,
  4061,
  7190,
  2207,
  2328,
  7983,
  744,
  652,
  5162,
  7169,
  5104,
  2678,
  3166,
  6877,
  8161,
  843,
  4901,
  1311,
  2244,
  2502,
  4056,
  46,
  3768,
  5434,
  4268,
  5327,
  4744,
  7204,
  3930,
  8801,
  2265,
  1157,
  2267,
  6097,
  2207,
  8801,
  3046,
  4130,
  4400,
  5616,
  3347,
  8473,
  1510,
  6657,
  2582,
  8871,
  5836,
  2578,
  3764,
  4673,
  1149,
  5385,
  5341,
  1182,
  3347,
  7457,
  88,
  8584,
  7983,
  1441,
  8935,
  8801,
  5588,
  2210,
  8584,
  3487,
  950,
  1852,
  5293,
  4656,
  3380,
  2466,
  4186,
  7156,
  7172,
  441,
  2578,
  8641,
  3023,
  7540,
  8849,
  2106,
  2854,
  6138,
  1157,
  8801,
  3152,
  5293,
  2207,
  7803,
  2207,
  4576,
  4623,
  3166,
  3098,
  386,
  4429,
  6262,
  5727,
  5412,
  2307,
  1250,
  3705,
  1420,
  8745,
  7101,
  3166,
  2822,
  5870,
  4999,
  5649,
  4746,
  7190,
  4553,
  4761,
  2113,
  3347,
  1856,
  7805,
  4601,
  5649,
  6719,
  8887,
  7381,
  8199,
  5261,
  6026,
  8757,
  4441,
  2694,
  1754,
  69,
  3172,
  7106,
  2000,
  8641,
  2781,
  4441,
  69,
  2360,
  4899,
  8255,
  6428,
  5832,
  2469,
  2542,
  3335,
  2937,
  2509,
  7974,
  ...],
 [4259,
  6985,
  7540,
  8795,
  824,
  5300,
  5152,
  5300,
  2425,
  1273,
  4971,
  60,
  6985,
  7540,
  2291,
  7721,
  4565,
  102,
  3037,
  2704,
  1363,
  7475,
  2522,
  1695,
  365,
  2773,
  3037,
  112,
  102,
  4565,
  7022,
  1666,
  4643,
  1816,
  470,
  1892,
  1695,
  4693,
  1911,
  8411,
  5383,
  7022,
  3719,
  3022,
  3626,
  8707,
  4623,
  1329,
  2426,
  5768,
  8262,
  2594,
  1772,
  5002,
  8707,
  7313,
  3765,
  1957,
  8971,
  132,
  3790,
  8564,
  1843,
  5198,
  7834,
  914,
  3792,
  3719,
  1022,
  3719,
  2346,
  233,
  855,
  1293,
  8707,
  3792,
  3719,
  1493,
  7659,
  2859,
  2758,
  3792,
  8707,
  104,
  5914,
  7192,
  4509,
  104,
  8530,
  4180,
  8707,
  102,
  3037,
  1695,
  3037,
  60,
  8453,
  5486,
  4062,
  2291,
  8453,
  1257,
  784,
  1709,
  60,
  1273,
  1166,
  3402,
  4259,
  4393,
  2286,
  2578,
  4062,
  1117,
  2975,
  60,
  5388,
  8439,
  8721,
  102,
  4565,
  7063,
  8721,
  5623,
  8721,
  3611,
  5152,
  3696,
  1479,
  6026,
  8721,
  2419,
  8721,
  5198,
  5623,
  8668,
  8721,
  7701,
  5798,
  102,
  3037,
  6804,
  4565,
  2419,
  4349,
  3930,
  2291,
  2522,
  7819,
  1695,
  7738,
  6862,
  7307,
  7974,
  2891,
  8149,
  2594,
  7443,
  2522,
  8278,
  5198,
  2522,
  4806,
  4766,
  6862,
  6390,
  441,
  5610,
  7721,
  2522,
  5293,
  2522,
  7834,
  4807,
  4766,
  7721,
  2522,
  5293,
  2522,
  823,
  6890,
  4893,
  7323,
  5974,
  6175,
  708,
  324,
  2786,
  36,
  2948,
  102,
  4565,
  4565,
  2988,
  4310,
  5102,
  5785,
  4766,
  914,
  7066,
  5785,
  4807,
  4766,
  60,
  607,
  3526,
  5858,
  724,
  5821,
  7859,
  3312,
  7191,
  571,
  60,
  2507,
  4893,
  4879,
  2786,
  8141,
  6313,
  5568,
  34,
  5241,
  1526,
  2786,
  1849,
  1745,
  5623,
  8846,
  5266,
  2716,
  2425,
  6914,
  90,
  4587,
  7465,
  1445,
  4540,
  8395,
  8609,
  6062,
  2786,
  4587,
  1557,
  3631,
  92,
  1695,
  3037,
  3037,
  3217,
  5778,
  724,
  6976,
  4623,
  2425,
  3957,
  7191,
  2425,
  5266,
  5858,
  8834,
  7272,
  6804,
  4565,
  102,
  3037,
  4565,
  1695,
  3037,
  2770,
  6804,
  5623,
  1455,
  2522,
  1695,
  98,
  6849,
  6862,
  7323,
  5974,
  1695,
  4807,
  4766,
  6804,
  5031,
  6862,
  7548,
  60,
  6985,
  2291,
  7721,
  365,
  2291,
  4565,
  1695,
  3037,
  8871,
  1745,
  7721,
  2522,
  8668,
  112,
  1111,
  102,
  1257,
  5592,
  4248,
  60,
  104,
  1117,
  2367,
  3792,
  102,
  3037,
  1695,
  3379,
  1022,
  1166,
  3816,
  4099,
  2958,
  1892,
  5104,
  3930,
  34,
  1166,
  8452,
  1892,
  6294,
  2704,
  2426,
  2291,
  7721,
  1793,
  1117,
  4610,
  1666,
  7592,
  6862,
  1695,
  6849,
  7454,
  3151,
  6862,
  5383,
  7022,
  5836,
  1666,
  5764,
  2291,
  470,
  1892,
  4693,
  1911,
  8411,
  1793,
  8117,
  1064,
  2291,
  3379,
  4272,
  3957,
  2425,
  2588,
  8653,
  1241,
  1293,
  34,
  2778,
  2965,
  1745,
  8721,
  34,
  5912,
  1257,
  1709,
  8418,
  2425,
  8188,
  5965,
  2786,
  7022,
  3719,
  60,
  5383,
  7272,
  4259,
  4099,
  2958,
  3535,
  3303,
  1096,
  8331,
  2758,
  3719,
  8820,
  1515,
  3897,
  5574,
  8903,
  633,
  3897,
  5780,
  1166,
  8707,
  4099,
  2958,
  1427,
  5857,
  6784,
  1445,
  7433,
  2425,
  2603,
  8707,
  2588,
  7022,
  1493,
  7272,
  3124,
  5946,
  104,
  5073,
  5452,
  3792,
  112,
  1614,
  4926,
  5623,
  4349,
  8721,
  95,
  104,
  7192,
  4970,
  8721,
  4544,
  5623,
  3250,
  95,
  4545,
  652,
  3574,
  7932,
  4807,
  4766,
  7108,
  5837,
  7272,
  5793,
  5623,
  4349,
  2522,
  102,
  4565,
  8834,
  7660,
  7721,
  2522,
  8191,
  4349,
  1695,
  365,
  2291,
  5398,
  1745,
  4349,
  823,
  4807,
  3347,
  4924,
  5554,
  8135,
  36,
  6156,
  36,
  8820,
  1111,
  102,
  7592,
  2875,
  652,
  4576,
  8834,
  5902,
  1892,
  5446,
  6662,
  4622,
  6604,
  1783,
  5385,
  3345,
  5793,
  4565,
  5641,
  3278,
  1768,
  1892,
  4544,
  202,
  8935,
  1892,
  4544,
  8935,
  441,
  1581,
  6553,
  1892,
  3644,
  2958,
  3644,
  1892,
  1653,
  7983,
  4099,
  4020,
  3644,
  95,
  5969,
  1892,
  5428,
  652,
  5626,
  731,
  3211,
  4616,
  4545,
  1892,
  8587,
  3278,
  1768,
  6316,
  2599,
  1666,
  1666,
  8,
  98,
  6849,
  6862,
  5974,
  7323,
  4766,
  6862,
  6877,
  8834,
  4789,
  7022,
  8862,
  7022,
  470,
  1892,
  4693,
  1911,
  8411,
  2342,
  1768,
  202,
  2522,
  1695,
  1695,
  4859,
  3574,
  7932,
  4481,
  2958,
  470,
  2846,
  1695,
  1911,
  4544,
  6862,
  8410,
  1400,
  470,
  7738,
  4766,
  6862,
  4729,
  523,
  1666,
  5533,
  8834,
  4207,
  8721,
  4693,
  1911,
  8411,
  6554,
  4693,
  1911,
  8411,
  8436,
  8455,
  5153,
  2875,
  4926,
  7263,
  2612,
  8521,
  1768,
  1183,
  8411,
  7714,
  3409,
  1695,
  8972,
  5920,
  7760,
  1695,
  2395,
  5847,
  4781,
  5785,
  4766,
  1695,
  7577,
  7819,
  1031,
  2475,
  36,
  2425,
  3957,
  953,
  3379,
  4272,
  2425,
  2595,
  4543,
  1241,
  1190,
  365,
  2779,
  4693,
  1911,
  8411,
  1695,
  470,
  1892,
  2779,
  7586,
  2507,
  4565,
  6804,
  2522,
  3513,
  325,
  3130,
  3409,
  4766,
  6804,
  6862,
  1695,
  4190,
  2779,
  7192,
  5293,
  2113,
  36,
  5385,
  7799,
  4926,
  4565,
  2779,
  4807,
  3347,
  3309,
  2113,
  36,
  2579,
  5113,
  2113,
  5197,
  652,
  2291,
  7738,
  2113,
  36,
  4565,
  5721,
  626,
  1745,
  1745,
  8721,
  4565,
  7114,
  5554,
  4565,
  2466,
  4565,
  626,
  8721,
  1653,
  3808,
  4272,
  3957,
  2425,
  8548,
  1954,
  1241,
  102,
  4565,
  7721,
  2522,
  1695,
  3037,
  34,
  2778,
  1849,
  1745,
  8721,
  4272,
  4842,
  2425,
  8333,
  424,
  8338,
  2425,
  4941,
  2704,
  4565,
  1695,
  2113,
  8017,
  5969,
  2425,
  7738,
  4893,
  814,
  4099,
  2958,
  1445,
  1892,
  652,
  7108,
  5088,
  7779,
  4426,
  7602,
  3184,
  3350,
  3176,
  5721,
  7597,
  8626,
  7983,
  2665,
  3005,
  5974,
  6262,
  3073,
  4708,
  7131,
  1790,
  7526,
  8992,
  2213,
  4708,
  7131,
  8834,
  1793,
  4545,
  2425,
  6134,
  8363,
  3409,
  5626,
  441,
  5981,
  8542,
  2089,
  1166,
  2758,
  4292,
  7505,
  8542,
  8530,
  8845,
  6912,
  8587,
  3164,
  6310,
  8038,
  5969,
  664,
  2042,
  1138,
  6238,
  3409,
  5131,
  7738,
  5023,
  7068,
  664,
  8871,
  919,
  197,
  7022,
  3535,
  652,
  4926,
  1686,
  1149,
  3409,
  3060,
  6428,
  8331,
  2758,
  5847,
  1420,
  104,
  7192,
  112,
  6214,
  6888,
  4688,
  2306,
  8707,
  8161,
  6719,
  8333,
  5898,
  6710,
  6428,
  3897,
  1793,
  1161,
  5383,
  1330,
  8331,
  2758,
  3897,
  2608,
  2153,
  412,
  8707,
  4693,
  1911,
  8411,
  652,
  2,
  7433,
  1445,
  1892,
  8455,
  4099,
  2958,
  1166,
  1445,
  1892,
  8455,
  1445,
  1892,
  8455,
  4099,
  7780,
  1445,
  1892,
  5857,
  1445,
  1892,
  8455,
  7780,
  1445,
  1892,
  5857,
  7738,
  566,
  7941,
  6548,
  36,
  3656,
  8452,
  6662,
  4893,
  7593,
  1445,
  1892,
  104,
  3656,
  1526,
  5623,
  5135,
  3656,
  1373,
  5565,
  796,
  8135,
  2887,
  407,
  104,
  2571,
  8975,
  2419,
  5198,
  8721,
  53,
  7002,
  8139,
  3897,
  2836,
  3529,
  1230,
  902,
  1230,
  708,
  5383,
  2302,
  1947,
  7433,
  8139,
  3060,
  8846,
  708,
  324,
  796,
  8139,
  708,
  324,
  4357,
  5764,
  8139,
  2113,
  8707,
  5798,
  1445,
  1892,
  1427,
  1445,
  1892,
  5798,
  1445,
  1892,
  1427,
  1445,
  1892,
  3897,
  8707,
  2425,
  8707,
  2425,
  1666,
  8317,
  7433,
  2425,
  2603,
  8707,
  2588,
  7022,
  1493,
  1117,
  7738,
  5857,
  6784,
  1445,
  1892,
  2425,
  8707,
  5969,
  708,
  324,
  8707,
  60,
  7102,
  2475,
  8161,
  708,
  324,
  843,
  36,
  8707,
  1493,
  8161,
  7022,
  6667,
  2466,
  3897,
  137,
  1695,
  8587,
  7662,
  5465,
  2425,
  1387,
  69,
  6798,
  441,
  8203,
  7819,
  1666,
  4529,
  1363,
  866,
  8320,
  4718,
  5785,
  4766,
  1695,
  6914,
  6238,
  ...]]

In [30]:
np.array(vecs.values()).shape


Out[30]:
(9007, 50)

Writing to file...it does not want things delmited by a comma

Also, no lists or anything, just new lines for a new word-vector or new document


In [23]:
with open("/Users/michael/Documents/GaussianLDA/dasnips.txt", 'w') as f:
    for doc in index_corpus:
#         f.write(str(doc).replace("[", "").replace("]","") + "\n")
        for word in doc:
            f.write(str(word) + " ")
        f.write("\n")

In [19]:
# lazy way of creating file that did not exist before
with open("/Users/michael/Documents/GaussianLDA/dasnips_vecs50.txt", 'w') as f:
    None

Saving the word-vectors to file


In [89]:
np.savetxt("/Users/michael/Documents/GaussianLDA/dasnips_vecs50.txt", np.array(vecs.values()))

Loading Das Results

We've run it, now lets look at the results


In [31]:
fp = "/Users/michael/Documents/Gaussian_LDA-master/output/"

In [28]:
!ls /Users/michael/Documents/Gaussian_LDA-master/output/sample50D2Ta


ls: /Users/michael/Documents/Gaussian_LDA-master/output/sample50D2Ta: No such file or directory

In [46]:
from collections import defaultdict

Super annoying to load the data in

must read data, turn it into floats, and throw into numpy array..

Only doing the means so we can examine them. Some instances, they come out all exactly the same.

Throws error b/c of junk with same name that the program outputs. Just ignore it there for now, it gets our topic params out just fine


In [81]:
das_data = defaultdict(dict)
counter = 1
for f in os.listdir("/Users/michael/Documents/Gaussian_LDA-master/output/"):
    if f.startswith("sample50D2Ta"):
        with open(fp+f, 'r') as datum:

#             print datum.readlines()[1:]
            topics = datum.readlines()
            das_data[counter]['mean'] = np.array([float(mean) for mean in topics[0].split()])
            das_data[counter]['cov'] = topics[1:]
            counter += 1


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-81-0eb2a31d484e> in <module>()
      7 #             print datum.readlines()[1:]
      8             topics = datum.readlines()
----> 9             das_data[counter]['mean'] = np.array([float(mean) for mean in topics[0].split()])
     10             das_data[counter]['cov'] = topics[1:]
     11             counter += 1

ValueError: could not convert string to float: Starting

In [82]:
das_data[1]['mean']

Checking rough meaning of each topic-mean

Using Gensim Word2Vec model's Most_Similar() method. This gives us a super quick and dirty idea of the peak of the distribution lays in the coordinate space. While this may not be informative of the covariance and what words it might produce, its fast + easy

More data munging just to look at it


In [90]:
for k in range(1, 51): # choose on number of topics.. only doing +1 since I was lazy and made it start at one, and not zero. 
    mean = das_data[k]['mean'].T
    print wvmodel.most_similar(positive=[mean])


[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'example', 0.9128363132476807), (u'particular', 0.9081135988235474), (u'rather', 0.8977897763252258), (u'certain', 0.8928133249282837), (u'instance', 0.8909444808959961), (u'same', 0.8892529010772705), (u'this', 0.886255145072937), (u'specific', 0.8861542344093323), (u'means', 0.8839278221130371), (u'similar', 0.8825365900993347)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]

In [65]:
for k in range(1, 51):
    das_data[k]['mean'] = np.array(das_data[k]['mean'])
    das_data[k]['cov'] = np.array(das_data[k]['cov'])

In [ ]:


In [ ]: